import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
# Locate the data directory: the "data" folder inside the parent of the
# current working directory.
data_path = os.path.dirname(os.getcwd()) + "/data"
print(data_path)
/Users/shawnguyen/Desktop/national-exam-scores-visuals/ds/data
# Load the 2021 exam-score table and order its rows by province code.
grades = pd.read_csv(data_path + '/diemthi2021.csv').sort_values(by='CityCode')
grades
| Unnamed: 0 | ID | CityCode | CityArea | Math | Literature | English | Physics | Chemistry | Biology | ... | Civic Education | A | B | C | A1 | D1 | A-BK | A1-BK | Highest_combi | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1000001 | 1 | 1.0 | 2.2 | 3.50 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 11.50 | NaN | NaN | NaN | NaN | C | 2021 |
| 67134 | 67455 | 1067456 | 1 | 1.0 | 5.8 | 5.25 | 2.2 | NaN | NaN | NaN | ... | 8.25 | NaN | NaN | 18.75 | NaN | 13.25 | NaN | NaN | C | 2021 |
| 67133 | 67454 | 1067455 | 1 | 1.0 | 8.4 | 8.00 | 9.4 | NaN | NaN | NaN | ... | 8.25 | NaN | NaN | 21.25 | NaN | 25.80 | NaN | NaN | D1 | 2021 |
| 67132 | 67453 | 1067454 | 1 | 1.0 | 5.4 | 6.25 | 5.0 | NaN | NaN | NaN | ... | 7.25 | NaN | NaN | 14.00 | NaN | 16.65 | NaN | NaN | D1 | 2021 |
| 67131 | 67452 | 1067453 | 1 | 1.0 | 4.0 | 3.75 | 2.2 | NaN | NaN | NaN | ... | 7.25 | NaN | NaN | 10.50 | NaN | 9.95 | NaN | NaN | C | 2021 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 997721 | 1016533 | 64002270 | 64 | 6.0 | 6.0 | 7.00 | 7.6 | 6.25 | 4.25 | 5.25 | ... | NaN | 16.5 | 15.5 | NaN | 19.85 | 20.60 | 16.875 | 19.3875 | D1 | 2021 |
| 997720 | 1016532 | 64002269 | 64 | 6.0 | 4.2 | 4.25 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2021 |
| 997719 | 1016531 | 64002268 | 64 | 6.0 | 8.0 | 6.75 | 6.4 | NaN | NaN | NaN | ... | 9.00 | NaN | NaN | 22.00 | NaN | 21.15 | NaN | NaN | C | 2021 |
| 997717 | 1016529 | 64002266 | 64 | 6.0 | 4.8 | 8.00 | 3.4 | NaN | NaN | NaN | ... | 8.75 | NaN | NaN | 19.00 | NaN | 16.20 | NaN | NaN | C | 2021 |
| 1002269 | 1021101 | 64006838 | 64 | 6.0 | 8.0 | 7.75 | 8.2 | NaN | NaN | NaN | ... | 7.50 | NaN | NaN | 18.50 | NaN | 23.95 | NaN | NaN | D1 | 2021 |
1002270 rows × 22 columns
# Load the per-province 2021 average scores, ordered by province code.
province_grades = pd.read_csv(data_path + '/Province_Avg2021.csv').sort_values(by='CityCode')
province_grades
| CityCode | Area | Math | Literature | English | Physic | Chemistry | Biology | Geography | History | Civic Education | A | B | C | A1 | D1 | A-BK | A1-BK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 6.951270 | 6.787373 | 6.463423 | 6.747615 | 6.291846 | 5.033329 | 6.777886 | 4.854949 | 8.171540 | 21.182955 | 19.502103 | 18.501420 | 21.854673 | 20.656833 | 22.001291 | 22.514281 |
| 49 | 2 | 5 | 7.160407 | 6.479206 | 7.228478 | 6.283695 | 6.533958 | 5.577198 | 7.070528 | 5.269874 | 8.588421 | 20.423466 | 19.734384 | 18.833810 | 21.456517 | 21.062123 | 21.031506 | 21.829240 |
| 5 | 3 | 1 | 6.967891 | 6.365921 | 6.508832 | 6.924992 | 6.548417 | 5.525145 | 7.144452 | 5.143850 | 8.831570 | 21.557381 | 20.164479 | 18.539329 | 22.255269 | 20.296301 | 22.233402 | 22.757126 |
| 31 | 4 | 3 | 6.848693 | 5.590065 | 6.450089 | 6.520510 | 6.452467 | 5.186634 | 6.561234 | 4.525579 | 8.011578 | 20.849575 | 19.568400 | 16.723309 | 21.416780 | 19.225303 | 21.553551 | 21.988156 |
| 11 | 5 | 2 | 4.915043 | 5.320990 | 4.173597 | 6.351534 | 6.383816 | 5.322719 | 6.394798 | 4.370715 | 7.585469 | 20.308195 | 19.394867 | 16.023203 | 20.218956 | 14.561329 | 20.929489 | 20.906867 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | 60 | 6 | 6.799120 | 6.852512 | 5.851652 | 6.532062 | 6.704327 | 6.068955 | 7.336867 | 5.566085 | 8.717433 | 20.920321 | 20.466723 | 19.673370 | 20.827611 | 19.660974 | 21.452997 | 21.393750 |
| 62 | 61 | 6 | 6.307506 | 6.417206 | 5.165218 | 6.009154 | 6.195559 | 5.739823 | 7.122315 | 5.157759 | 8.650025 | 19.431882 | 19.163213 | 18.572888 | 19.120035 | 17.925831 | 19.996578 | 19.763931 |
| 21 | 62 | 2 | 5.570078 | 6.405598 | 4.486407 | 6.136446 | 6.732503 | 5.703914 | 6.865908 | 5.011578 | 8.051588 | 20.560667 | 20.191414 | 18.180503 | 19.819961 | 16.624004 | 21.203955 | 20.663865 |
| 42 | 63 | 4 | 6.174730 | 5.508003 | 4.768368 | 6.171119 | 6.360659 | 5.282128 | 6.844336 | 4.950118 | 8.005086 | 20.037930 | 19.178842 | 17.188822 | 19.385118 | 16.523130 | 20.667805 | 20.187190 |
| 59 | 64 | 6 | 6.041824 | 6.275926 | 4.589736 | 6.122096 | 6.543271 | 5.742384 | 6.984076 | 4.860245 | 8.385452 | 19.918906 | 19.594501 | 18.008589 | 18.828684 | 17.022804 | 20.373501 | 19.584586 |
63 rows × 18 columns
# Load the per-province indicator sheet, ordered by province code so its
# rows line up with the other province-level tables.
province_info = pd.read_excel(data_path + '/ProvinceInfo.xlsx').sort_values(by='CityCode')
display(province_info)
| CityName | CityCode | Area | Monthly Income | Province Specialized Schools | Ethnic Pupils Ratio (%) | Students/ Teacher | Urban Ratio | Poverty Rate | Covid 19 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hà Nội | 1 | 1 | 6205.45 | 8 | 2.12 | 21.93 | 49.25 | 0.4904 | 14421 |
| 49 | Thành phố Hồ Chí Minh | 2 | 5 | 6536.88 | 2 | 5.59 | 19.25 | 80.10 | 0.0000 | 91981 |
| 5 | Hải Phòng | 3 | 1 | 5199.40 | 1 | 0.11 | 21.46 | 45.41 | 0.9488 | 233 |
| 31 | Đà Nẵng | 4 | 3 | 5283.63 | 1 | 0.41 | 17.41 | 87.25 | 0.5066 | 6331 |
| 11 | Hà Giang | 5 | 2 | 1834.55 | 1 | 77.56 | 16.23 | 15.85 | 26.9815 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | Bạc Liêu | 60 | 6 | 3338.48 | 1 | 7.79 | 23.37 | 27.88 | 6.6718 | 51 |
| 62 | Cà Mau | 61 | 6 | 3034.40 | 1 | 2.41 | 18.36 | 22.71 | 5.9179 | 13 |
| 21 | Điện Biên | 62 | 2 | 1737.12 | 1 | 79.39 | 16.15 | 14.36 | 36.7358 | 1798 |
| 42 | Đắk Nông | 63 | 4 | 2808.54 | 1 | 22.85 | 17.52 | 15.28 | 9.0347 | 0 |
| 59 | Hậu Giang | 64 | 6 | 3974.29 | 1 | 3.08 | 19.38 | 27.97 | 5.5990 | 1 |
63 rows × 10 columns
def student_in_range(a, b, subject):
    """Count, per province, the rows of ``grades`` whose score lies in ``(a, b]``.

    Reads the module-level ``grades`` DataFrame, which must contain a
    ``CityCode`` column and a column named ``subject``.

    Args:
        a: Exclusive lower bound of the score band.
        b: Inclusive upper bound of the score band.
        subject: Name of the score column in ``grades`` to inspect.

    Returns:
        A list of ints, one per distinct ``CityCode`` in ascending order.
        Provinces with no score in the band contribute ``0``. Missing (NaN)
        scores never match; rows with a missing ``CityCode`` are ignored.
    """
    scores = grades[subject]
    # NaN fails both comparisons, so absent scores are excluded from every band.
    in_band = (scores > a) & (scores <= b)
    # A single grouped sum over the mask replaces one full-table scan per
    # province. groupby sorts its keys, so the result order is the ascending
    # CityCode order callers rely on, and empty provinces naturally sum to 0.
    per_city = in_band.astype('int64').groupby(
        grades['CityCode'].to_numpy(), sort=True
    ).sum()
    return per_city.tolist()
def plot_subject(subject):
    """Show how each province's ``subject`` scores split across five score bands.

    For every row of the module-level ``province_grades`` table, computes the
    percentage of counted scores falling in the bands (0-2], (2-4], (4-6],
    (6-8] and (8-10] (a score of exactly 0 is counted in the first band),
    displays the resulting table, and renders it as a parallel-coordinates
    chart coloured by ``Area``.

    Relies on ``student_in_range`` returning one count per province in
    ascending ``CityCode`` order, matching the row order of
    ``province_grades``; a length mismatch raises ``ValueError``.

    Args:
        subject: Name of the score column to analyse (e.g. ``'Math'``).
    """
    # (output column, exclusive lower bound, inclusive upper bound).
    # The first lower bound sits just below zero so that a score of 0 is counted.
    bands = [
        ('p02', -1e-9, 2),
        ('p24', 2, 4),
        ('p46', 4, 6),
        ('p68', 6, 8),
        ('p810', 8, 10),
    ]

    # Take an explicit copy: adding columns to a bare slice of
    # province_grades is what triggered pandas' SettingWithCopyWarning.
    data = province_grades[['CityCode', 'Area']].copy()

    counts = pd.DataFrame(
        {column: student_in_range(low, high, subject) for column, low, high in bands},
        index=data.index,
    )
    # Each band as a percentage of all scores counted for that province.
    percentages = counts.div(counts.sum(axis=1), axis=0) * 100
    data = data.join(percentages)

    display(data)
    fig = px.parallel_coordinates(
        data,
        color="Area",
        labels={'p02': '0->2', 'p24': '2->4', 'p46': '4->6', 'p68': '6->8', 'p810': '8->10'},
        color_continuous_scale=px.colors.diverging.Tealrose,
        color_continuous_midpoint=3,
        title=f"PROVINCES {subject.upper()} GRADES PROPORTION (%) DISTRIBUTION - 2021",
    )
    fig.show()
# Display the per-province score-band table and chart for Math.
plot_subject('Math')
/var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['0-2'] = student_in_range(-1e-9, 2, subject) /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['2-4'] = student_in_range(2, 4, subject) /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['4-6'] = student_in_range(4, 6, subject) /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['6-8'] = student_in_range(6, 8, subject) /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['8-10'] = student_in_range(8, 10, subject) /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['0-10'] = data['0-2']+data['2-4']+data['4-6']+data['6-8']+data['8-10'] /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['p02'] = data['0-2']/data['0-10']*100 /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['p24'] = data['2-4']/data['0-10']*100 /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['p46'] = data['4-6']/data['0-10']*100 /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['p68'] = data['6-8']/data['0-10']*100 /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['p810'] = data['8-10']/data['0-10']*100
| CityCode | Area | p02 | p24 | p46 | p68 | p810 | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.721498 | 8.432819 | 17.747246 | 42.346219 | 30.752219 |
| 49 | 2 | 5 | 0.066356 | 2.196740 | 16.343423 | 57.450524 | 23.942957 |
| 5 | 3 | 1 | 0.499573 | 6.806149 | 18.714774 | 45.550811 | 28.428693 |
| 31 | 4 | 3 | 0.956633 | 7.900191 | 18.654337 | 47.297513 | 25.191327 |
| 11 | 5 | 2 | 2.997624 | 35.660757 | 34.271614 | 20.928532 | 6.141473 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | 60 | 6 | 0.081460 | 3.861193 | 23.395243 | 56.630824 | 16.031281 |
| 62 | 61 | 6 | 0.324314 | 9.368050 | 31.773536 | 47.081171 | 11.452928 |
| 21 | 62 | 2 | 0.898001 | 19.942392 | 41.579126 | 30.430363 | 7.150119 |
| 42 | 63 | 4 | 0.608921 | 13.122241 | 31.405084 | 41.802405 | 13.061349 |
| 59 | 64 | 6 | 0.356453 | 12.475865 | 35.972078 | 42.714986 | 8.480618 |
63 rows × 7 columns
# Display the per-province score-band table and chart for English.
plot_subject('English')
/var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/ln/rz3xh60x0zbb7cfm8z1lrlm40000gn/T/ipykernel_81620/2560017616.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| CityCode | Area | p02 | p24 | p46 | p68 | p810 | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1.717851 | 24.010917 | 17.763354 | 19.101626 | 37.406252 |
| 49 | 2 | 5 | 0.315108 | 8.815115 | 18.053450 | 30.204227 | 42.612101 |
| 5 | 3 | 1 | 0.880827 | 17.945055 | 24.558396 | 24.029900 | 32.585821 |
| 31 | 4 | 3 | 1.310198 | 19.900850 | 21.804178 | 24.212110 | 32.772663 |
| 11 | 5 | 2 | 5.951668 | 55.322548 | 23.746755 | 8.647893 | 6.331136 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | 60 | 6 | 0.437139 | 17.415632 | 40.304249 | 28.309145 | 13.533835 |
| 62 | 61 | 6 | 1.382183 | 32.007952 | 37.621888 | 19.757645 | 9.230332 |
| 21 | 62 | 2 | 1.666036 | 46.989777 | 36.842105 | 8.670958 | 5.831125 |
| 42 | 63 | 4 | 3.020188 | 42.997934 | 30.980766 | 13.876967 | 9.124146 |
| 59 | 64 | 6 | 2.966102 | 45.856874 | 31.732580 | 13.214062 | 6.230383 |
63 rows × 7 columns
# Province-level indicators used as clustering features.
data = province_info[[
    'Monthly Income',
    'Province Specialized Schools',
    'Ethnic Pupils Ratio (%)',
    'Students/ Teacher',
    'Urban Ratio',
    'Poverty Rate',
]]
# Min-max scale every column to [0, 1] so that no indicator dominates the
# distance computation purely because of its units.
normalized_data = data.sub(data.min()).div(data.max().sub(data.min()))
X = normalized_data.to_numpy()
X.shape
(63, 6)
# Split the provinces into two clusters on the scaled indicators; the seed is
# fixed so the assignment is reproducible between runs.
kmeans = KMeans(n_clusters=2, init='random', n_init=10, tol=1e-3, random_state=2021).fit(X)

# kmeans.labels_ is ordered like the rows of X, i.e. like normalized_data and
# province_info, so a positional assignment is correct for those two frames.
normalized_data[['CityCode']] = province_info[['CityCode']]
normalized_data['Label'] = kmeans.labels_
province_info['Label'] = kmeans.labels_

# province_grades comes from a different file, so attach its labels by
# CityCode instead of trusting that its rows happen to be in the same order.
# A province missing from province_info gets NaN rather than a wrong label.
label_by_city = pd.Series(kmeans.labels_, index=province_info['CityCode'].to_numpy())
province_grades['Label'] = province_grades['CityCode'].map(label_by_city)

normalized_data[normalized_data['Label'] == 0]
| Monthly Income | Province Specialized Schools | Ethnic Pupils Ratio (%) | Students/ Teacher | Urban Ratio | Poverty Rate | CityCode | Label | |
|---|---|---|---|---|---|---|---|---|
| 11 | 0.018395 | 0.0 | 0.813629 | 0.170874 | 0.078115 | 0.734474 | 5 | 0 |
| 12 | 0.084504 | 0.0 | 1.000000 | 0.120388 | 0.178696 | 0.706815 | 6 | 0 |
| 22 | 0.031660 | 0.0 | 0.828013 | 0.281553 | 0.099419 | 0.839367 | 7 | 0 |
| 15 | 0.130256 | 0.0 | 0.625472 | 0.299029 | 0.179600 | 0.418671 | 8 | 0 |
| 14 | 0.178205 | 0.0 | 0.602373 | 0.339806 | 0.051904 | 0.351238 | 9 | 0 |
| 18 | 0.105612 | 0.0 | 0.889857 | 0.139806 | 0.170949 | 0.333781 | 10 | 0 |
| 13 | 0.085850 | 0.0 | 0.941201 | 0.164078 | 0.161007 | 0.584378 | 11 | 0 |
| 16 | 0.148271 | 0.0 | 0.488975 | 0.572816 | 0.136346 | 0.410077 | 13 | 0 |
| 23 | 0.001490 | 0.0 | 0.828538 | 0.736893 | 0.052421 | 0.831042 | 14 | 0 |
| 24 | 0.183159 | 0.0 | 0.745170 | 0.205825 | 0.181407 | 0.247388 | 23 | 0 |
| 39 | 0.120478 | 0.0 | 0.379357 | 0.000000 | 0.296191 | 0.599794 | 36 | 0 |
| 21 | 0.000000 | 0.0 | 0.832843 | 0.163107 | 0.058877 | 1.000000 | 62 | 0 |
# .tolist() converts the NumPy integers to plain Python ints, so the printed
# list reads "[1, 1, 0, ...]" on every NumPy version (NumPy 2 prints NumPy
# scalars with their type, e.g. "np.int32(1)").
labels_list = kmeans.labels_.tolist()
print(labels_list)
# Cluster sizes.
print(labels_list.count(0))
print(labels_list.count(1))
# Provinces assigned to cluster 1.
display(province_info.loc[province_info['Label'] == 1, ['CityCode', 'Label']])
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1] 12 51
| CityCode | Label | |
|---|---|---|
| 0 | 1 | 1 |
| 49 | 2 | 1 |
| 5 | 3 | 1 |
| 31 | 4 | 1 |
| 17 | 12 | 1 |
| 20 | 15 | 1 |
| 1 | 16 | 1 |
| 3 | 17 | 1 |
| 19 | 18 | 1 |
| 2 | 19 | 1 |
| 4 | 21 | 1 |
| 6 | 22 | 1 |
| 8 | 24 | 1 |
| 9 | 25 | 1 |
| 7 | 26 | 1 |
| 10 | 27 | 1 |
| 25 | 28 | 1 |
| 26 | 29 | 1 |
| 27 | 30 | 1 |
| 28 | 31 | 1 |
| 29 | 32 | 1 |
| 30 | 33 | 1 |
| 32 | 34 | 1 |
| 33 | 35 | 1 |
| 34 | 37 | 1 |
| 40 | 38 | 1 |
| 35 | 39 | 1 |
| 41 | 40 | 1 |
| 36 | 41 | 1 |
| 43 | 42 | 1 |
| 44 | 43 | 1 |
| 46 | 44 | 1 |
| 37 | 45 | 1 |
| 45 | 46 | 1 |
| 38 | 47 | 1 |
| 47 | 48 | 1 |
| 50 | 49 | 1 |
| 55 | 50 | 1 |
| 56 | 51 | 1 |
| 48 | 52 | 1 |
| 51 | 53 | 1 |
| 57 | 54 | 1 |
| 58 | 55 | 1 |
| 52 | 56 | 1 |
| 54 | 57 | 1 |
| 53 | 58 | 1 |
| 60 | 59 | 1 |
| 61 | 60 | 1 |
| 62 | 61 | 1 |
| 42 | 63 | 1 |
| 59 | 64 | 1 |
def plot_radar(data, range_x):
    """Draw one filled radar (polar) trace per row of ``data`` and show the figure.

    Args:
        data: DataFrame whose columns are the radar axes and whose rows are
            the series to draw. Each row's index label appears in its trace
            name as ``Label <index>``.
        range_x: Value passed through as the radial axis range
            (e.g. ``[0, 1]``).
    """
    categories = list(data.columns)
    fig = go.Figure()
    # Walk the rows themselves instead of looking up data.loc[0..n-1]: that
    # lookup only works when the index is exactly 0..n-1, and the position
    # would be shown as the label even if the real index differed.
    for label, row in data.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=list(row.values),
            theta=categories,
            fill='toself',
            name=f'Label {label}',
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=range_x,
            ),
        ),
        showlegend=False,
    )
    fig.show()
# Mean of each scaled indicator within every cluster. The province code and
# the specialized-school count are dropped before averaging.
radar_data_1 = (
    normalized_data
    .drop(columns=['CityCode', 'Province Specialized Schools'])
    .groupby('Label')
    .mean()
)
display(radar_data_1)
plot_radar(radar_data_1, [0, 1])
| Monthly Income | Ethnic Pupils Ratio (%) | Students/ Teacher | Urban Ratio | Poverty Rate | |
|---|---|---|---|---|---|
| Label | |||||
| 0 | 0.090657 | 0.747953 | 0.266181 | 0.137078 | 0.588085 |
| 1 | 0.421242 | 0.078608 | 0.420864 | 0.299103 | 0.116899 |
# Mean subject score within every cluster. Identifier columns and the
# combination totals are dropped so only the individual subjects remain.
radar_data_2 = (
    province_grades
    .drop(columns=["CityCode", "Area", 'A', 'B', 'C', 'A1', 'D1', 'A-BK', 'A1-BK'])
    .groupby('Label')
    .mean()
)
display(radar_data_2)
plot_radar(radar_data_2, [0, 9])
| Math | Literature | English | Physic | Chemistry | Biology | Geography | History | Civic Education | |
|---|---|---|---|---|---|---|---|---|---|
| Label | |||||||||
| 0 | 5.659077 | 6.059589 | 4.819625 | 6.499246 | 6.63131 | 5.647756 | 6.881958 | 4.919888 | 8.227905 |
| 1 | 6.614581 | 6.438082 | 5.750278 | 6.580169 | 6.67044 | 5.573001 | 6.994920 | 5.007699 | 8.417800 |